Run Libraries

rm(list=ls())

library(pheatmap)
library(ggplot2)
library(dichromat)
library(dplyr)
library(ggrepel)
library(reshape2)
library(umap)
library(ggthemes)
library(cowplot)
library(DEP)
library(readr)
library(naniar)
library(SummarizedExperiment)
library(data.table)
library(readxl)
library(writexl)
library(stringr)
library(vsn)
library(rmarkdown)
library(tidyr)
library(Polychrome)
library(RColorBrewer)
# install.packages("devtools")
# if (!require("BiocManager", quietly = TRUE))
#     install.packages("BiocManager")
# options(repos = c(
#     CRAN = "https://cloud.r-project.org/",
#     BiocManager::repositories()))
# ## Install dependencies and package
# devtools::install_github(
#     "BioinfOMICS/LipidSigR",
#     build_vignettes = TRUE, dependencies = TRUE)
# BiocManager::install(
#     c('fgsea', 'gatom', 'mixOmics', 'S4Vectors', 'BiocGenerics',
#       'SummarizedExperiment', 'rgoslin'))
# install.packages(
#     c('devtools', 'magrittr', 'plotly', 'tidyverse', 'factoextra', 'ggthemes',
#       'ggforce', 'Hmisc', 'hwordcloud', 'heatmaply', 'iheatmapr', 'Rtsne', 'uwot',
#       'wordcloud', 'rsample', 'ranger', 'caret', 'yardstick', 'fastshap',
#       'SHAPforxgboost', 'visNetwork', 'tidygraph', 'ggraph'))
# devtools::install_github("ctlab/mwcsr")

Set working directories

setwd(dirname(rstudioapi::getActiveDocumentContext()$path))

# create directory for results
dir.create(file.path(getwd(),'results'), showWarnings = FALSE)
# create directory for plots
dir.create(file.path(getwd(),'plots'), showWarnings = FALSE)

Load data

Loading data including:

  • All templates: information on the tube ID, rack, visit, country, type of fluid, position
  • Shipment of all boxes file: information on the patient ID, visit, position
  • Raw data: lipidomics expression with info on the tube ID for CSF and Plasma

Step 1: merge info of all patients

The first step is to merge the information of the “All templates” and “Shipment of all boxes” files, and create 3 datasets (one for plasma, one for CSF, one for urine) with the information of Patient ID, Position, Biofluid, Visit, Tube number, Country, Rack and Box.

- CSF

## sample info - CSF
paged_table(lipidomics_CSF_all)
dim(lipidomics_CSF_all)
## [1] 415   8
lipidomics_CSF_all_summary <- lipidomics_CSF_all %>%
  group_by(Visit) %>%
  summarise(
    total_patients = n_distinct(Patient, na.rm = TRUE),
    total_tubes = n_distinct(Tube_number, na.rm = TRUE),
    total_patients_tubes_pairs = n_distinct(
      paste(Patient, Tube_number)[!is.na(Patient) & !is.na(Tube_number)]))
lipidomics_CSF_all_summary
# which tube IDs are missing for Patient IDs at V0
lipidomics_CSF_all_V0 = lipidomics_CSF_all %>%
  filter(Visit == "V0")
paged_table(lipidomics_CSF_all_V0 %>%
  filter(is.na(Tube_number) & !is.na(Patient)))
# check which Patient IDs are duplicated at V0
paged_table(lipidomics_CSF_all_V0 %>%
  filter(Patient %in% lipidomics_CSF_all_V0[duplicated(lipidomics_CSF_all_V0$Patient) & !is.na(lipidomics_CSF_all_V0$Patient),]$Patient))
# which tube IDs are missing for Patient IDs at V1
lipidomics_CSF_all_V1 = lipidomics_CSF_all %>%
  filter(Visit == "V1")
paged_table(lipidomics_CSF_all_V1 %>%
  filter(is.na(Tube_number) & !is.na(Patient)))
# check which Patient IDs are duplicated at V1
paged_table(lipidomics_CSF_all_V1 %>%
  filter(Patient %in% lipidomics_CSF_all_V1[duplicated(lipidomics_CSF_all_V1$Patient) & !is.na(lipidomics_CSF_all_V1$Patient),]$Patient))

- Plasma

## sample info - plasma
paged_table(lipidomics_plasma_all)
dim(lipidomics_plasma_all)
## [1] 415   8
lipidomics_plasma_all_summary <- lipidomics_plasma_all %>%
  group_by(Visit) %>%
   summarise(
    total_patients = n_distinct(Patient, na.rm = TRUE),
    total_tubes = n_distinct(Tube_number, na.rm = TRUE),
    total_patients_tubes_pairs = n_distinct(
      paste(Patient, Tube_number)[!is.na(Patient) & !is.na(Tube_number)]))
lipidomics_plasma_all_summary
# which tube IDs are missing for Patient IDs at V0
lipidomics_plasma_all_V0 = lipidomics_plasma_all %>%
  filter(Visit == "V0")
paged_table(lipidomics_plasma_all_V0 %>%
  filter(is.na(Tube_number) & !is.na(Patient)))
# check which Patient IDs are duplicated at V0
paged_table(lipidomics_plasma_all_V0 %>%
  filter(Patient %in% lipidomics_plasma_all_V0[duplicated(lipidomics_plasma_all_V0$Patient) & !is.na(lipidomics_plasma_all_V0$Patient),]$Patient))
# which tube IDs are missing for Patient IDs at V1
lipidomics_plasma_all_V1 = lipidomics_plasma_all %>%
  filter(Visit == "V1")
paged_table(lipidomics_plasma_all_V1 %>%
  filter(is.na(Tube_number) & !is.na(Patient)))
# check which Patient IDs are duplicated at V1
paged_table(lipidomics_plasma_all_V1 %>%
  filter(Patient %in% lipidomics_plasma_all_V1[duplicated(lipidomics_plasma_all_V1$Patient) & !is.na(lipidomics_plasma_all_V1$Patient),]$Patient))

- Urine

## sample info - urine
paged_table(lipidomics_urine_all)
dim(lipidomics_urine_all)
## [1] 384   8
lipidomics_urine_all_summary <- lipidomics_urine_all %>%
  group_by(Visit) %>%
  summarise(
    total_patients = n_distinct(Patient, na.rm = TRUE),
    total_tubes = n_distinct(Tube_number, na.rm = TRUE),
    total_patients_tubes_pairs = n_distinct(
      paste(Patient, Tube_number)[!is.na(Patient) & !is.na(Tube_number)]))
lipidomics_urine_all_summary
# which tube IDs are missing for Patient IDs at V0
lipidomics_urine_all_V0 = lipidomics_urine_all %>%
  filter(Visit == "V0")
paged_table(lipidomics_urine_all_V0 %>%
  filter(is.na(Tube_number) & !is.na(Patient)))
# check which Patient IDs are duplicated at V0
paged_table(lipidomics_urine_all_V0 %>%
  filter(Patient %in% lipidomics_urine_all_V0[duplicated(lipidomics_urine_all_V0$Patient) & !is.na(lipidomics_urine_all_V0$Patient),]$Patient))
# which tube IDs are missing for Patient IDs at V1
lipidomics_urine_all_V1 = lipidomics_urine_all %>%
  filter(Visit == "V1")
paged_table(lipidomics_urine_all_V1 %>%
  filter(is.na(Tube_number) & !is.na(Patient)))
# check which Patient IDs are duplicated at V1
paged_table(lipidomics_urine_all_V1 %>%
  filter(Patient %in% lipidomics_urine_all_V1[duplicated(lipidomics_urine_all_V1$Patient) & !is.na(lipidomics_urine_all_V1$Patient),]$Patient))

Step 2: aggregate all raw files

The second step is to aggregate all the raw data files together and save the batch information, in order to check later if there are any batch effects in the data.

- CSF

paged_table(raw_data_CSF_final)
dim(raw_data_CSF_final)
## [1] 174 894
# how many rows of QC
length(grep("QC",raw_data_CSF_final$Tube_number))
## [1] 30

- Plasma

paged_table(raw_data_plasma_final)
dim(raw_data_plasma_final)
## [1] 382 894
# how many rows of QC
length(grep("QC",raw_data_plasma_final$Tube_number))
## [1] 66

Overview of data

Step 3: Combine raw data files with patient info

The third step is to combine the raw data files with the patient information, and check the amount of samples per visit for each fluid.

- CSF

raw_data_CSF_IDs = raw_data_CSF_final %>%
  select(Tube_number,Batch)
raw_data_CSF_IDs <- raw_data_CSF_IDs %>%
  mutate(Tube_number4 = str_sub(Tube_number,-4)) %>% # last 4 digits
  filter(!str_detect(Tube_number, "QC")) # remove 30 QC IDs

# how many IDs in the raw data CSF
length(raw_data_CSF_IDs$Tube_number)
## [1] 144
# how many unique IDs in the raw data CSF
length(unique(raw_data_CSF_IDs$Tube_number))
## [1] 144
lipidomics_CSF_all <- lipidomics_CSF_all %>%
  mutate(Tube_number4 = str_sub(Tube_number,-4)) # last 4 digits
full_match_CSF = raw_data_CSF_IDs %>%
  left_join(lipidomics_CSF_all,by = "Tube_number") %>%
  mutate(match_type = ifelse(!is.na(Tube_number4.y),"Full",NA))
no_match_CSF = full_match_CSF %>%
  filter(is.na(match_type)) %>%
  dplyr::rename(Tube_number4 = Tube_number4.y) %>%
  select(-names(lipidomics_CSF_all)) %>%
  dplyr::rename(Tube_number4 = Tube_number4.x)
last4_match_CSF = no_match_CSF %>%
  left_join(lipidomics_CSF_all %>% 
              filter(!Tube_number %in% (full_match_CSF %>% filter(!is.na(match_type)) %>%
                                          pull(Tube_number))),
            by = "Tube_number4") %>%
  mutate(match_type = ifelse(!is.na(Tube_number),"Last4","No match"))

# final description of the samples available in the raw data without QC samples
combined_data_CSF = full_match_CSF %>%
  filter(!is.na(match_type)) %>%
  select(-c(Tube_number4.x,Tube_number4.y,match_type)) %>%
  bind_rows(last4_match_CSF %>% 
              select(-Tube_number) %>% 
              left_join(raw_data_CSF_IDs %>% select(Tube_number,Tube_number4) %>%
                          filter(!Tube_number %in% (full_match_CSF %>% filter(!is.na(match_type)) %>%
                                          pull(Tube_number)))) %>%
              select(Tube_number,Patient,Position,Biofluid,Visit,Country,Rack,Box,Batch)) %>%
  distinct()

paged_table(combined_data_CSF)
# how many IDs in the raw data CSF now 
length(combined_data_CSF$Tube_number)
## [1] 144
# how many unique IDs in the raw data CSF now
length(unique(combined_data_CSF$Tube_number))
## [1] 144
# how many unique Patient IDs
length(unique(na.omit(combined_data_CSF$Patient)))
## [1] 112
# how many tube numbers and patient IDs per V0 and V1
combined_data_CSF %>%
  group_by(Visit) %>%
  summarise(
    total_patients = n_distinct(Patient, na.rm = TRUE), # amount of patient IDs
    total_tubes = n_distinct(Tube_number, na.rm = TRUE), # amount of tube numbers
    total_patients_tubes_pairs = n_distinct(
      paste(Patient, Tube_number)[!is.na(Patient) & !is.na(Tube_number)])) # amount of pairs
# which Tube numbers at V1 do not have a patient ID
combined_data_CSF %>%
  filter(Visit == "V1") %>%
  filter(is.na(Patient))
# patients at V0 and V1
patients_CSF_V0 <- combined_data_CSF %>%
  filter(Visit == "V0") %>%
  pull(Patient) %>%
  unique()
patients_CSF_V1 <- combined_data_CSF %>%
  filter(Visit == "V1") %>%
  pull(Patient) %>%
  unique() %>% 
  na.omit()

# check if all patients at V1 are at V0
all(patients_CSF_V1 %in% patients_CSF_V0)
## [1] FALSE
patients_CSF_V1[!patients_CSF_V1 %in% patients_CSF_V0] # patient TR319
## [1] "TR319"
# check patient in the info_patient dataset
lipidomics_CSF_all[lipidomics_CSF_all$Patient %in% patients_CSF_V1[!patients_CSF_V1 %in% patients_CSF_V0],]

- Plasma

raw_data_plasma_IDs = raw_data_plasma_final %>%
  select(Tube_number,Batch)
raw_data_plasma_IDs <- raw_data_plasma_IDs %>%
  mutate(Tube_number4 = str_sub(Tube_number,-4)) %>% # last 4 digits
  filter(!str_detect(Tube_number, "QC")) # remove 66 QC IDs

# how many IDs in the raw data plasma
length(raw_data_plasma_IDs$Tube_number)
## [1] 316
# how many unique IDs in the raw data plasma
length(unique(raw_data_plasma_IDs$Tube_number))
## [1] 316
lipidomics_plasma_all <- lipidomics_plasma_all %>%
  mutate(Tube_number4 = str_sub(Tube_number,-4)) # last 4 digits
full_match_plasma = raw_data_plasma_IDs %>%
  left_join(lipidomics_plasma_all,by = "Tube_number") %>%
  mutate(match_type = ifelse(!is.na(Tube_number4.y),"Full",NA))
no_match_plasma = full_match_plasma %>%
  filter(is.na(match_type)) %>%
  dplyr::rename(Tube_number4 = Tube_number4.y) %>%
  select(-names(lipidomics_plasma_all)) %>%
  dplyr::rename(Tube_number4 = Tube_number4.x)
last4_match_plasma = no_match_plasma %>%
  left_join(lipidomics_plasma_all,by = "Tube_number4") %>%
  mutate(match_type = ifelse(!is.na(Tube_number),"Last4","No match"))

# final description of the samples available in the raw data without QC samples
combined_data_plasma = full_match_plasma %>%
  filter(!is.na(match_type)) %>%
  select(-c(Tube_number4.x,Tube_number4.y,match_type)) %>%
  bind_rows(last4_match_plasma %>% 
              select(-Tube_number) %>% 
              left_join(raw_data_plasma_IDs %>% select(Tube_number,Tube_number4)) %>%
              select(Tube_number,Patient,Position,Biofluid,Visit,Country,Rack,Box,Batch)) %>%
  distinct()

paged_table(combined_data_plasma)
# how many IDs in the raw data plasma now 
length(combined_data_plasma$Tube_number)
## [1] 316
# how many unique IDs in the raw data plasma now
length(unique(combined_data_plasma$Tube_number))
## [1] 316
# how many unique Patient IDs
length(unique(na.omit(combined_data_plasma$Patient)))
## [1] 200
# how many tube numbers and patient IDs per V0 and V1
combined_data_plasma %>%
  group_by(Visit) %>%
  summarise(
    total_patients = n_distinct(Patient, na.rm = TRUE), # amount of patient IDs
    total_tubes = n_distinct(Tube_number, na.rm = TRUE), # amount of tube numbers
    total_patients_tubes_pairs = n_distinct(
      paste(Patient, Tube_number)[!is.na(Patient) & !is.na(Tube_number)])) # amount of pairs
# which Tube numbers at V0 do not have a patient ID
combined_data_plasma %>%
  filter(Visit == "V0") %>%
  filter(is.na(Patient))
# which Tube numbers at V1 do not have a patient ID
combined_data_plasma %>%
  filter(Visit == "V1") %>%
  filter(is.na(Patient))
# patients at V0 and V1
patients_plasma_V0 <- combined_data_plasma %>%
  filter(Visit == "V0") %>%
  pull(Patient) %>%
  unique()
patients_plasma_V1 <- combined_data_plasma %>%
  filter(Visit == "V1") %>%
  pull(Patient) %>%
  unique() %>% 
  na.omit()

# check if all patients at V1 are at V0
all(patients_plasma_V1 %in% patients_plasma_V0)
## [1] FALSE
patients_plasma_V1[!patients_plasma_V1 %in% patients_plasma_V0] # patients TR207, TR116 and TR123
## [1] "TR207" "TR116" "TR123"
# check patient in the info_patient dataset
lipidomics_plasma_all[lipidomics_plasma_all$Patient %in% patients_plasma_V1[!patients_plasma_V1 %in% patients_plasma_V0],]

Step 4: Overview of the samples based on status

The fourth step is to give an overview of how many samples of eALS, PGMC, CTR and mimic we have per fluid, visit and country.

- CSF (the patient IDs given by Laura)

# IDs in CSF (131)
IDs_CSF <- read_delim("data input/export-2025-11-07-PREMODIALS-AKDTR_BRNO_CHUFR_HMCIL_HRO_KSSGCH_MRI_NIUSASSK/CSFAcquisition.csv",
                          delim = ";", escape_double = FALSE, trim_ws = TRUE) %>%
  filter(CsfSexists == 1) %>%
  group_by(PatientID) %>%
  mutate(Visit = paste0("V", row_number() - 1)) %>%
  ungroup() %>% 
  select(PatientID,Visit) %>% 
  distinct()
IDs_CSF_data_1 = IDs_CSF %>%
  left_join(GeneralDocumentation %>% select(PatientID,ParticipantCode)) %>%
  dplyr::rename(Patient = ParticipantCode) %>%
  distinct() %>%
  left_join(all_participants_IDs %>% dplyr::rename(Patient = ParticipantCode)) %>%
  mutate(
    Country = dplyr::case_when(
      grepl("TR", Patient) ~ "Turkey",
      grepl("CH", Patient) ~ "Switzerland",
      grepl("DE", Patient) ~ "Germany",
      grepl("SK", Patient) ~ "Slovakia",
      grepl("FR", Patient) ~ "France",
      grepl("IL", Patient) ~ "Israel",
      TRUE                 ~ NA_character_
    )) %>%
  arrange(Patient)

paged_table(IDs_CSF_data_1)
overview_country_CSF_1 = IDs_CSF_data_1 %>%
  group_by(Visit,Country) %>%
  summarise(n_unique_patients = n_distinct(Patient),.groups = "drop") %>%
  arrange(Visit,desc(n_unique_patients))
overview_country_CSF_1
overview_status_CSF_1 = IDs_CSF_data_1 %>%
  group_by(type,Visit,Country) %>% 
  distinct() %>%
  summarise(nr_samples = n_distinct(Patient)) %>%
  select(type, Visit,Country,nr_samples) %>%
  pivot_wider(
    names_from  = type,        
    values_from = nr_samples,  
    values_fill = 0) %>%
  select(Visit,Country,PGMC,CTR,ALS,mimic,SYMP,C9orf72,SOD1,TARDBP,FUS,other,`NA`)
overview_status_CSF_1

- CSF (the patient IDs existing in the raw data)

IDs_CSF_data = combined_data_CSF %>%
  select(Patient,Visit,Tube_number) %>%
  distinct() %>%
  left_join(all_participants_IDs %>% dplyr::rename(Patient = ParticipantCode)) %>%
  mutate(
    Country = dplyr::case_when(
      grepl("TR", Patient) ~ "Turkey",
      grepl("CH", Patient) ~ "Switzerland",
      grepl("DE", Patient) ~ "Germany",
      grepl("SK", Patient) ~ "Slovakia",
      grepl("FR", Patient) ~ "France",
      grepl("IL", Patient) ~ "Israel",
      TRUE                 ~ NA_character_
    )) %>%
  arrange(Patient)

paged_table(IDs_CSF_data)
overview_country_CSF = IDs_CSF_data %>%
  group_by(Visit,Country) %>%
  summarise(n_unique_patients = n_distinct(Patient),.groups = "drop") %>%
  arrange(Visit,desc(n_unique_patients))
overview_country_CSF
overview_status_CSF = IDs_CSF_data %>%
  group_by(type,Visit,Country) %>% 
  distinct() %>%
  summarise(nr_samples = n_distinct(Patient)) %>%
  select(type, Visit,Country,nr_samples) %>%
  pivot_wider(
    names_from  = type,        
    values_from = nr_samples,  
    values_fill = 0) %>%
  select(Visit,Country,PGMC,CTR,ALS,mimic,SYMP,C9orf72,SOD1,TARDBP,FUS,other,N.A.,`NA`)
overview_status_CSF

- Plasma (the patient IDs given by Laura)

# IDs in plasma (235)
IDs_plasma <- read_delim("data input/export-2025-11-07-PREMODIALS-AKDTR_BRNO_CHUFR_HMCIL_HRO_KSSGCH_MRI_NIUSASSK/BloodAcquisition.csv",
                          delim = ";", escape_double = FALSE, trim_ws = TRUE) %>%
  filter(BAexists == 1) %>% 
  group_by(PatientID) %>%
  mutate(Visit = paste0("V", row_number() - 1)) %>%
  ungroup() %>% 
  select(PatientID,Visit) %>% 
  distinct()

IDs_plasma_data_1 = IDs_plasma %>%
  left_join(GeneralDocumentation %>% select(PatientID,ParticipantCode)) %>%
  dplyr::rename(Patient = ParticipantCode) %>%
  distinct() %>%
  left_join(all_participants_IDs %>% dplyr::rename(Patient = ParticipantCode)) %>%
  mutate(
    Country = dplyr::case_when(
      grepl("TR", Patient) ~ "Turkey",
      grepl("CH", Patient) ~ "Switzerland",
      grepl("DE", Patient) ~ "Germany",
      grepl("SK", Patient) ~ "Slovakia",
      grepl("FR", Patient) ~ "France",
      grepl("IL", Patient) ~ "Israel",
      TRUE                 ~ NA_character_
    )) %>%
  arrange(Patient)

paged_table(IDs_plasma_data_1)
overview_country_plasma_1 = IDs_plasma_data_1 %>%
  group_by(Visit,Country) %>%
  summarise(n_unique_patients = n_distinct(Patient),.groups = "drop") %>%
  arrange(Visit,desc(n_unique_patients))
overview_country_plasma_1
overview_status_plasma_1 = IDs_plasma_data_1 %>%
  group_by(type,Visit,Country) %>% 
  distinct() %>%
  summarise(nr_samples = n_distinct(Patient)) %>%
  select(type, Visit,Country,nr_samples) %>%
  pivot_wider(
    names_from  = type,        
    values_from = nr_samples,  
    values_fill = 0) %>%
  select(Visit,Country,PGMC,CTR,ALS,mimic,SYMP,C9orf72,SOD1,TARDBP,FUS,other,`NA`)
overview_status_plasma_1

- Plasma (the patient IDs existing in the raw data)

IDs_plasma_data = combined_data_plasma %>%
  select(Patient,Visit,Tube_number) %>%
  distinct() %>%
  left_join(all_participants_IDs %>% dplyr::rename(Patient = ParticipantCode)) %>%
  mutate(
    Country = dplyr::case_when(
      grepl("TR", Patient) ~ "Turkey",
      grepl("CH", Patient) ~ "Switzerland",
      grepl("DE", Patient) ~ "Germany",
      grepl("SK", Patient) ~ "Slovakia",
      grepl("FR", Patient) ~ "France",
      grepl("IL", Patient) ~ "Israel",
      TRUE                 ~ NA_character_
    )) %>%
  arrange(Patient)

paged_table(IDs_plasma_data)
overview_country_plasma = IDs_plasma_data %>%
  group_by(Visit,Country) %>%
  summarise(n_unique_patients = n_distinct(Patient),.groups = "drop") %>%
  arrange(Visit,desc(n_unique_patients))
overview_country_plasma
overview_status_plasma = IDs_plasma_data %>%
  group_by(type,Visit,Country) %>% 
  distinct() %>%
  summarise(nr_samples = n_distinct(Patient)) %>%
  select(type, Visit,Country,nr_samples) %>%
  pivot_wider(
    names_from  = type,        
    values_from = nr_samples,  
    values_fill = 0) %>%
  select(Visit,Country,PGMC,CTR,ALS,mimic,SYMP,C9orf72,SOD1,TARDBP,FUS,other,N.A.,`NA`)
overview_status_plasma

Bioinformatic analyses

Step 5: creation of summarised experiment datasets

The fifth step is to remove the QC rows, save the experimental setup (info of patient ID, batch, sample ID, visit, etc.), structure the lipidomics dataset to numeric and create summarised experiments datasets.

- CSF

# removal of QC rows
raw_data_CSF_use = raw_data_CSF_final %>%
  filter(!str_detect(Tube_number, "QC"))

tube_batch_CSF = raw_data_CSF_use[,1:2] %>%
  left_join(combined_data_CSF %>% select(Tube_number,Batch,Patient,Visit))
raw_data_CSF_use_mat = as.matrix(raw_data_CSF_use[,3:ncol(raw_data_CSF_use)])
rownames(raw_data_CSF_use_mat) = tube_batch_CSF %>% pull(Tube_number)

# matrix with raw lipidomics data (rows - lipids and column - tube ID)
lipidomics_CSF = as.data.frame(t(raw_data_CSF_use_mat))
dim(lipidomics_CSF)
## [1] 892 144
# make columns numeric
for(i in 1:ncol(lipidomics_CSF)){lipidomics_CSF[,i] = as.numeric(lipidomics_CSF[,i])}

#make a list for all dataframes
lipidomics_data_CSF = list(raw_data = lipidomics_CSF)

## summarized objects 
tube_batch_CSF = tube_batch_CSF %>% 
  left_join(IDs_CSF_data) %>%
  mutate(subtype = ifelse(type == "C9orf72","C9orf72",
                          ifelse(type == "SOD1","SOD1",
                                 ifelse(type == "FUS","FUS",
                                        ifelse(type == "other","other",
                                               ifelse(type == "TARDBP","TARDBP",NA)))))) %>%
  mutate(type = ifelse(type %in% c("C9orf72","SOD1","FUS","TARDBP","other"),
                       "PGMC",type)) %>%
  arrange(subtype) %>%
  distinct(Tube_number, Batch,Patient,PatientID,type, .keep_all = TRUE)

# Check which tube numbers do not have a patient ID or type assigned (majority uncertain PURE MOTOR SYMPTOM (PMS) / MND MIMIC / EARLY ALS (EALS))
tube_number_CSF_missing = tube_batch_CSF %>%
  filter(is.na(Patient) | is.na(type)) %>%
  arrange(Patient)
tube_number_CSF_missing
GeneralDocumentation %>%
  select(ParticipantCode,PatientID,PGMC,ALSuncertainty,ALSFUdiagnosis) %>%
  filter(ParticipantCode %in% tube_number_CSF_missing$Patient) %>%
  filter(!is.na(ParticipantCode))
abundance.columns <- 1:ncol(lipidomics_data_CSF$raw_data) # get abundance column numbers
      clin = data.frame(label = tube_batch_CSF$Tube_number, 
                        condition = tube_batch_CSF$type,
                        sub_condition = tube_batch_CSF$subtype,
                        batch = tube_batch_CSF$Batch)
      
      lipidomics_data_CSF$raw_data$name = lipidomics_data_CSF$raw_data$ID = rownames(lipidomics_data_CSF$raw_data)
      experimental.design = clin
      experimental.design <- experimental.design %>%
        group_by(condition) %>%            
        mutate(replicate = row_number()) %>%
        ungroup()
      
      lipidomics_data_CSF$se <- make_se(lipidomics_data_CSF$raw_data, abundance.columns, experimental.design)
lipidomics_data_CSF$se
## class: SummarizedExperiment 
## dim: 892 144 
## metadata(0):
## assays(1): ''
## rownames(892): CE(18:1-d7) Cholesterol(d7) ... PS(38:4)_PS(18:0/20:4)
##   PS(40:4)_PS(20:0/20:4)
## rowData names(2): ID name
## colnames(144): PGMC_1 PGMC_2 ... CTR_26 CTR_27
## colData names(6): label ID ... batch replicate
 writexl::write_xlsx(lipidomics_data_CSF$raw_data,"results/lipidomics_CSF_raw_data.xlsx")

- Plasma

# removal of QC rows
raw_data_plasma_use = raw_data_plasma_final %>%
  filter(!str_detect(Tube_number, "QC"))

tube_batch_plasma = raw_data_plasma_use[,1:2] %>%
  left_join(combined_data_plasma %>% select(Tube_number,Batch,Patient,Visit))
raw_data_plasma_use_mat = as.matrix(raw_data_plasma_use[,3:ncol(raw_data_plasma_use)])
rownames(raw_data_plasma_use_mat) = tube_batch_plasma %>% pull(Tube_number)

# matrix with raw lipidomics data (rows - lipids and column - tube ID)
lipidomics_plasma = as.data.frame(t(raw_data_plasma_use_mat))
dim(lipidomics_plasma)
## [1] 892 316
# make columns numeric
for(i in 1:ncol(lipidomics_plasma)){lipidomics_plasma[,i] = as.numeric(lipidomics_plasma[,i])}

#make a list for all dataframes
lipidomics_data_plasma = list(raw_data = lipidomics_plasma)

## summarized objects 
tube_batch_plasma = tube_batch_plasma %>% 
  left_join(IDs_plasma_data) %>%
  mutate(subtype = ifelse(type == "C9orf72","C9orf72",
                          ifelse(type == "SOD1","SOD1",
                                 ifelse(type == "FUS","FUS",
                                        ifelse(type == "other","other",
                                               ifelse(type == "TARDBP","TARDBP",NA)))))) %>%
  mutate(type = ifelse(type %in% c("C9orf72","SOD1","FUS","TARDBP","other"),
                       "PGMC",type)) %>%
  arrange(subtype) %>%
  distinct(Tube_number, Batch,Patient,PatientID,type, .keep_all = TRUE)

# Check which tube numbers do not have a patient ID or type assigned (majority uncertain PURE MOTOR SYMPTOM (PMS) / MND MIMIC / EARLY ALS (EALS))
tube_number_plasma_missing = tube_batch_plasma %>%
  filter(is.na(Patient) | is.na(type)) %>%
  arrange(Patient)
tube_number_plasma_missing
GeneralDocumentation %>%
  select(ParticipantCode,PatientID,PGMC,ALSuncertainty,ALSFUdiagnosis) %>%
  filter(ParticipantCode %in% tube_number_plasma_missing$Patient) %>%
  filter(!is.na(ParticipantCode))
abundance.columns <- 1:ncol(lipidomics_data_plasma$raw_data) # get abundance column numbers
      clin = data.frame(label = tube_batch_plasma$Tube_number, 
                        condition = tube_batch_plasma$type,
                        sub_condition = tube_batch_plasma$subtype,
                        batch = tube_batch_plasma$Batch)
      
      lipidomics_data_plasma$raw_data$name = lipidomics_data_plasma$raw_data$ID = rownames(lipidomics_data_plasma$raw_data)
      experimental.design = clin
      experimental.design <- experimental.design %>%
        group_by(condition) %>%            
        mutate(replicate = row_number()) %>%
        ungroup()
      
      lipidomics_data_plasma$se <- make_se(lipidomics_data_plasma$raw_data, abundance.columns, experimental.design)
 lipidomics_data_plasma$se
## class: SummarizedExperiment 
## dim: 892 316 
## metadata(0):
## assays(1): ''
## rownames(892): CE(18:1-d7) Cholesterol(d7) ... PS(38:4)_PS(18:0/20:4)
##   PS(40:4)_PS(20:0/20:4)
## rowData names(2): ID name
## colnames(316): PGMC_1 PGMC_2 ... mimic_32 ALS_91
## colData names(6): label ID ... batch replicate
 writexl::write_xlsx(lipidomics_data_plasma$raw_data,"results/lipidomics_plasma_raw_data.xlsx")

Step 6: missing analyses

The sixth step is to check for missing omics per patient and overall, filter lipids that are not quantified in less than 1/3 of the samples and normalise the data (using vsn).

- CSF

# heatmap missing 
vis_miss(lipidomics_data_CSF$raw_data,show_perc = TRUE,show_perc_col = TRUE,cluster = F)

ggsave("plots/missing_vis_miss_heatmap_CSF.png", width = 15, height = 10, units = "in")

# filter for lipids that are quantified in at least 2/3 of the samples
lipidomics_data_CSF$se_filtered = filter_proteins(lipidomics_data_CSF$se,"fraction",min = 0.66)

#dimensions of the data
dim(lipidomics_data_CSF$se)
## [1] 892 144
dim(lipidomics_data_CSF$se_filtered)
## [1] 186 144
# heatmap missing for filtered data
vis_miss(as.data.frame(assay(lipidomics_data_CSF$se_filtered)),show_perc = TRUE,show_perc_col = TRUE,cluster = F)

ggsave("plots/missing_vis_miss_heatmap_filtered_CSF.png", width = 12, height = 8, units = "in")

plot_frequency(lipidomics_data_CSF$se)

ggsave("plots/frequency_met_identification_raw_CSF.pdf", width = 11, height = 8, units = "in")
plot_frequency(lipidomics_data_CSF$se_filtered)

ggsave("plots/frequency_met_identification_filtered_CSF.pdf", width = 11, height = 8, units = "in")

# % missing per patient: 
round(apply(X = as.data.frame(assay(lipidomics_data_CSF$se)), function(x) sum(is.na(x)), MARGIN = 2) / nrow(as.data.frame(assay(lipidomics_data_CSF$se))) * 100 , 1)
##   PGMC_1   PGMC_2   PGMC_3   PGMC_4   PGMC_5   PGMC_6   PGMC_7   PGMC_8 
##     77.5     76.3     77.9     77.5     77.4     77.7     77.0     77.2 
##   PGMC_9  PGMC_10  PGMC_11  PGMC_12  PGMC_13  PGMC_14  PGMC_15  PGMC_16 
##     79.0     79.8     76.9     78.6     76.7     78.6     77.8     77.7 
##  PGMC_17  PGMC_18  PGMC_19  PGMC_20  PGMC_21  PGMC_22  PGMC_23  PGMC_24 
##     78.3     78.6     77.1     76.6     77.5     77.6     78.9     79.0 
##  PGMC_25  PGMC_26  PGMC_27  PGMC_28  PGMC_29  PGMC_30    CTR_1  mimic_1 
##     78.6     78.0     77.5     77.2     78.4     77.8     76.5     73.0 
##    CTR_2  mimic_2    ALS_1    CTR_3   SYMP_1    ALS_2    ALS_3    CTR_4 
##     77.1     75.8     73.8     73.3     78.3     75.9     76.5     76.9 
##  mimic_3    ALS_4    CTR_5    CTR_6  mimic_4  mimic_5   SYMP_2    ALS_5 
##     76.1     76.2     75.9     77.2     77.0     74.8     78.0     76.3 
##  mimic_6    ALS_6    ALS_7    CTR_7  mimic_7    CTR_8  mimic_8    ALS_8 
##     76.3     78.5     76.1     76.3     76.1     76.8     76.1     75.8 
##    ALS_9    CTR_9   ALS_10  mimic_9   ALS_11   ALS_12   ALS_13 mimic_10 
##     76.6     76.9     75.7     72.5     74.6     73.9     73.5     76.9 
##   ALS_14   SYMP_3   ALS_15   CTR_10 mimic_11   CTR_11   ALS_16   CTR_12 
##     78.3     76.2     78.4     77.7     78.4     78.6     75.0     72.9 
##   CTR_13   CTR_14 mimic_12   CTR_15   CTR_16   SYMP_4 mimic_13   ALS_17 
##     78.4     77.1     77.6     77.6     78.0     77.8     75.4     77.9 
##   ALS_18 mimic_14   CTR_17   ALS_19   ALS_20 mimic_15   ALS_21   ALS_22 
##     79.1     75.6     76.9     77.0     78.6     76.6     79.9     75.8 
##   CTR_18 mimic_16   ALS_23   ALS_24   SYMP_5   CTR_19   ALS_25   ALS_26 
##     77.9     76.0     76.2     75.3     77.9     77.7     78.1     78.8 
##   ALS_27   ALS_28   SYMP_6   ALS_29   ALS_30   SYMP_7   ALS_31   ALS_32 
##     78.5     79.3     80.0     78.0     79.0     78.1     77.5     78.0 
##   ALS_33   CTR_20   SYMP_8 mimic_17   SYMP_9   CTR_21   CTR_22   ALS_34 
##     75.2     79.0     77.9     77.5     76.5     77.2     80.4     81.1 
##   ALS_35   ALS_36   ALS_37   CTR_23   ALS_38   ALS_39   ALS_40 mimic_18 
##     76.6     74.4     79.3     79.1     77.1     78.7     77.9     78.8 
##   ALS_41   N.A._1  SYMP_10   CTR_24   ALS_42   ALS_43   ALS_44   ALS_45 
##     77.5     79.4     78.1     79.0     77.8     76.0     76.3     77.7 
##   CTR_25 mimic_19 mimic_20    NA._1    NA._2   ALS_46   ALS_47   ALS_48 
##     78.8     79.8     77.9     59.3     78.3     78.4     78.0     76.6 
##   ALS_49   N.A._2 mimic_21   ALS_50 mimic_22   ALS_51   CTR_26   CTR_27 
##     74.9     81.6     80.2     77.4     76.9     77.0     78.3     78.5
round(apply(X = as.data.frame(assay(lipidomics_data_CSF$se_filtered)), function(x) sum(is.na(x)), MARGIN = 2) / nrow(as.data.frame(assay(lipidomics_data_CSF$se_filtered))) * 100 , 1)
##   PGMC_1   PGMC_2   PGMC_3   PGMC_4   PGMC_5   PGMC_6   PGMC_7   PGMC_8 
##      1.6      2.2      5.4      1.1      2.2      2.2      0.5      2.7 
##   PGMC_9  PGMC_10  PGMC_11  PGMC_12  PGMC_13  PGMC_14  PGMC_15  PGMC_16 
##      6.5      6.5      7.0      3.2      4.8      3.8      3.8      3.8 
##  PGMC_17  PGMC_18  PGMC_19  PGMC_20  PGMC_21  PGMC_22  PGMC_23  PGMC_24 
##      3.2      4.8      2.2      0.5      2.2      2.7      7.0      3.8 
##  PGMC_25  PGMC_26  PGMC_27  PGMC_28  PGMC_29  PGMC_30    CTR_1  mimic_1 
##      3.2      3.8      3.8      4.8      3.2      2.7      2.2      0.0 
##    CTR_2  mimic_2    ALS_1    CTR_3   SYMP_1    ALS_2    ALS_3    CTR_4 
##      5.4      0.5      0.5      0.0      3.2      1.1      1.1      1.1 
##  mimic_3    ALS_4    CTR_5    CTR_6  mimic_4  mimic_5   SYMP_2    ALS_5 
##      1.6      2.7      1.1      2.7      1.6      2.2      3.8      1.1 
##  mimic_6    ALS_6    ALS_7    CTR_7  mimic_7    CTR_8  mimic_8    ALS_8 
##      2.2      4.8      1.6      1.1      1.1      2.2      1.1      0.5 
##    ALS_9    CTR_9   ALS_10  mimic_9   ALS_11   ALS_12   ALS_13 mimic_10 
##      1.1      1.6      1.1      2.2      0.5      0.5      0.5      1.6 
##   ALS_14   SYMP_3   ALS_15   CTR_10 mimic_11   CTR_11   ALS_16   CTR_12 
##      6.5      2.2      3.2      1.6      3.8      2.7      0.5      2.7 
##   CTR_13   CTR_14 mimic_12   CTR_15   CTR_16   SYMP_4 mimic_13   ALS_17 
##      2.7      1.6      3.8      2.2      2.7      3.8      1.1      2.2 
##   ALS_18 mimic_14   CTR_17   ALS_19   ALS_20 mimic_15   ALS_21   ALS_22 
##      6.5      1.1      1.1      1.1      4.3      2.2      8.1      1.1 
##   CTR_18 mimic_16   ALS_23   ALS_24   SYMP_5   CTR_19   ALS_25   ALS_26 
##      2.2      1.6      1.6      1.1      2.2      2.2      3.2      4.3 
##   ALS_27   ALS_28   SYMP_6   ALS_29   ALS_30   SYMP_7   ALS_31   ALS_32 
##      3.2      5.9      9.1      5.4      6.5      4.3      4.8      2.7 
##   ALS_33   CTR_20   SYMP_8 mimic_17   SYMP_9   CTR_21   CTR_22   ALS_34 
##      3.2      5.4      4.3      3.2      2.2      2.7     11.3     14.0 
##   ALS_35   ALS_36   ALS_37   CTR_23   ALS_38   ALS_39   ALS_40 mimic_18 
##      2.7      1.1      8.1      6.5      1.6      7.0      3.2      5.4 
##   ALS_41   N.A._1  SYMP_10   CTR_24   ALS_42   ALS_43   ALS_44   ALS_45 
##      1.6      6.5      3.2      7.5      4.3      2.7      0.5      2.2 
##   CTR_25 mimic_19 mimic_20    NA._1    NA._2   ALS_46   ALS_47   ALS_48 
##      7.0      9.7      3.8      0.5      3.8      3.8      3.8      1.6 
##   ALS_49   N.A._2 mimic_21   ALS_50 mimic_22   ALS_51   CTR_26   CTR_27 
##      0.5     17.7      8.6      3.2      1.6      3.2      4.8      5.4
#normalization
lipidomics_data_CSF$se_filt_norm <- normalize_vsn(lipidomics_data_CSF$se_filtered)
DEP::meanSdPlot(lipidomics_data_CSF$se_filt_norm)
## Warning: `aes_string()` was deprecated in ggplot2 3.0.0.
## ℹ Please use tidy evaluation idioms with `aes()`.
## ℹ See also `vignette("ggplot2-in-packages")` for more information.
## ℹ The deprecated feature was likely used in the vsn package.
##   Please report the issue to the authors.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

writexl::write_xlsx(as.data.frame(assay(lipidomics_data_CSF$se_filtered)),"results/data_filtered_CSF.xlsx")
writexl::write_xlsx(as.data.frame(assay(lipidomics_data_CSF$se_filt_norm)),"results/data_filtered_normalized_CSF.xlsx")

# imputation
lipidomics_data_CSF$se_filt_impute <- impute(
  lipidomics_data_CSF$se_filt_norm,fun = "MinProb",q = 0.01)
## Imputing along margin 2 (samples/columns).
## [1] 0.4905946
writexl::write_xlsx(as.data.frame(assay(lipidomics_data_CSF$se_filt_impute)),"results/data_filtered_normalized_imputed_CSF.xlsx")

- Plasma

# heatmap missing 
vis_miss(lipidomics_data_plasma$raw_data,show_perc = TRUE,show_perc_col = TRUE,cluster = F)

ggsave("plots/missing_vis_miss_heatmap_plasma.png", width = 15, height = 10, units = "in")

# filter for lipids that are quantified in at least 2/3 of the samples
lipidomics_data_plasma$se_filtered = filter_proteins(lipidomics_data_plasma$se,"fraction",min = 0.66)

#dimensions of the data
dim(lipidomics_data_plasma$se)
## [1] 892 316
dim(lipidomics_data_plasma$se_filtered)
## [1] 684 316
# heatmap missing for filtered data
vis_miss(as.data.frame(assay(lipidomics_data_plasma$se_filtered)),show_perc = TRUE,show_perc_col = TRUE,cluster = F)

ggsave("plots/missing_vis_miss_heatmap_filtered_plasma.png", width = 12, height = 8, units = "in")

plot_frequency(lipidomics_data_plasma$se)

ggsave("plots/frequency_met_identification_raw_plasma.pdf", width = 11, height = 8, units = "in")
plot_frequency(lipidomics_data_plasma$se_filtered)

ggsave("plots/frequency_met_identification_filtered_plasma.pdf", width = 11, height = 8, units = "in")

# % missing per patient: 
round(apply(X = as.data.frame(assay(lipidomics_data_plasma$se)), function(x) sum(is.na(x)), MARGIN = 2) / nrow(as.data.frame(assay(lipidomics_data_plasma$se))) * 100 , 1)
##   PGMC_1   PGMC_2   PGMC_3   PGMC_4   PGMC_5   PGMC_6   PGMC_7   PGMC_8 
##     16.6     18.7     19.7     19.1     19.5     18.8     20.0     20.7 
##   PGMC_9  PGMC_10  PGMC_11  PGMC_12  PGMC_13  PGMC_14  PGMC_15  PGMC_16 
##     22.0     22.6     23.1     23.7     22.3     21.1     21.3     22.2 
##  PGMC_17  PGMC_18  PGMC_19  PGMC_20  PGMC_21  PGMC_22  PGMC_23  PGMC_24 
##     22.4     22.3     27.9     29.1     24.3     27.5     28.4     24.4 
##  PGMC_25  PGMC_26  PGMC_27  PGMC_28  PGMC_29  PGMC_30  PGMC_31  PGMC_32 
##     26.9     26.8     24.8     23.1     22.1     22.6     22.1     16.8 
##  PGMC_33  PGMC_34  PGMC_35  PGMC_36  PGMC_37  PGMC_38  PGMC_39  PGMC_40 
##     16.7     16.7     18.6     20.2     19.3     17.8     18.0     18.5 
##  PGMC_41  PGMC_42  PGMC_43  PGMC_44  PGMC_45  PGMC_46  PGMC_47  PGMC_48 
##     20.4     20.0     16.6     18.2     18.5     19.1     17.9     18.5 
##  PGMC_49  PGMC_50  PGMC_51  PGMC_52  PGMC_53  PGMC_54  PGMC_55  PGMC_56 
##     17.9     19.8     23.8     25.0     20.0     20.1     21.5     21.3 
##  PGMC_57  PGMC_58  PGMC_59  PGMC_60  PGMC_61  PGMC_62  PGMC_63  PGMC_64 
##     22.4     23.0     27.5     17.4     15.5     17.9     18.2     16.4 
##  PGMC_65  PGMC_66  PGMC_67  PGMC_68  PGMC_69  PGMC_70  PGMC_71  PGMC_72 
##     16.8     17.5     18.6     16.7     18.0     23.3     23.0     16.1 
##  PGMC_73  PGMC_74  PGMC_75  PGMC_76  PGMC_77  PGMC_78  PGMC_79  PGMC_80 
##     19.4     19.1     20.1     18.7     23.1     18.3     21.6     20.6 
##  PGMC_81  PGMC_82  PGMC_83  PGMC_84  PGMC_85    ALS_1    CTR_1    CTR_2 
##     19.1     19.7     22.9     28.1     25.4     18.2     17.9     17.6 
##    ALS_2    CTR_3    ALS_3  mimic_1    CTR_4    ALS_4    ALS_5    ALS_6 
##     16.0     17.8     17.9     20.9     15.7     16.8     15.8     17.2 
##   SYMP_1  mimic_2    ALS_7    CTR_5    CTR_6  mimic_3    ALS_8    ALS_9 
##     16.4     15.7     14.1     16.1     15.5     18.3     18.4     18.2 
##    CTR_7    CTR_8    CTR_9   SYMP_2   CTR_10   SYMP_3   CTR_11   ALS_10 
##     21.7     16.6     16.8     17.2     18.8     16.7     18.2     16.4 
##   ALS_11   ALS_12   CTR_12   SYMP_4  mimic_4   ALS_13   CTR_13   ALS_14 
##     16.5     14.8     17.4     14.8     17.4     15.4     16.0     15.4 
##   ALS_15   CTR_14  mimic_5  mimic_6  mimic_7   ALS_16   CTR_15   CTR_16 
##     16.8     17.9     18.3     20.0     19.2     21.9     18.0     17.2 
##  mimic_8   ALS_17   CTR_17   ALS_18   CTR_18  mimic_9   CTR_19   CTR_20 
##     17.0     18.4     17.6     17.2     19.8     19.3     17.5     18.5 
## mimic_10   CTR_21   CTR_22 mimic_11   ALS_19   CTR_23 mimic_12   ALS_20 
##     17.5     17.8     20.1     18.5     16.8     19.2     20.1     17.7 
##   ALS_21   ALS_22   ALS_23   ALS_24   CTR_24   CTR_25   ALS_25   CTR_26 
##     23.8     16.1     19.5     17.3     21.0     19.7     21.1     19.3 
##   CTR_27   ALS_26   ALS_27   CTR_28   CTR_29   CTR_30   CTR_31   CTR_32 
##     20.0     18.9     17.2     16.7     17.3     18.9     19.6     19.3 
##   CTR_33   ALS_28   ALS_29 mimic_13   ALS_30   CTR_34   ALS_31   CTR_35 
##     19.1     19.4     18.3     17.3     19.1     21.1     18.2     20.3 
##   ALS_32   ALS_33   CTR_36   CTR_37   ALS_34   ALS_35   ALS_36   SYMP_5 
##     18.3     18.7     17.4     20.5     16.9     19.4     17.9     19.7 
##   CTR_38   ALS_37   CTR_39   CTR_40   ALS_38 mimic_14   ALS_39 mimic_15 
##     19.1     16.1     19.1     18.7     20.9     17.9     20.0     20.4 
##   CTR_41   CTR_42   CTR_43   ALS_40   ALS_41   CTR_44   SYMP_6   ALS_42 
##     21.1     22.1     20.1     22.4     20.1     21.5     20.9     20.5 
##   ALS_43   ALS_44   SYMP_7   SYMP_8   CTR_45   ALS_45   CTR_46 mimic_16 
##     22.5     23.4     23.3     20.4     20.2     20.6     23.5     20.1 
## mimic_17   CTR_47   CTR_48   ALS_46   CTR_49   CTR_50 mimic_18   ALS_47 
##     20.0     21.6     21.0     21.6     21.3     20.6     21.3     20.2 
##   CTR_51   CTR_52 mimic_19   CTR_53   CTR_54   CTR_55   CTR_56   ALS_48 
##     21.3     21.5     20.9     21.1     21.6     22.6     21.3     21.9 
##   ALS_49   ALS_50   CTR_57   CTR_58   CTR_59   CTR_60   ALS_51   CTR_61 
##     25.6     24.0     22.8     21.6     23.1     23.2     20.6     24.3 
##   ALS_52   CTR_62   ALS_53   CTR_63   ALS_54   ALS_55   ALS_56   CTR_64 
##     20.9     21.4     23.4     22.1     24.3     22.3     25.1     23.2 
##   ALS_57 mimic_20   ALS_58   SYMP_9   ALS_59  SYMP_10   ALS_60   ALS_61 
##     22.1     21.7     22.9     22.8     22.3     24.3     22.1     23.1 
##   ALS_62   ALS_63   ALS_64   ALS_65 mimic_21 mimic_22 mimic_23   CTR_65 
##     22.4     21.4     23.7     22.0     22.8     24.7     25.2     23.4 
##   CTR_66 mimic_24 mimic_25   CTR_67   ALS_66   CTR_68   N.A._1   ALS_67 
##     22.2     23.7     22.9     24.2     22.2     25.4     21.4     23.0 
##  SYMP_11   CTR_69   ALS_68   CTR_70 mimic_26  SYMP_12   ALS_69 mimic_27 
##     23.3     21.7     23.3     23.3     21.6     22.2     26.3     22.4 
##   ALS_70  SYMP_13   CTR_71   CTR_72    NA._1   ALS_71 mimic_28    NA._2 
##     23.2     22.8     23.9     23.2     23.2     25.2     21.6     22.6 
##   ALS_72   CTR_73   CTR_74   CTR_75   ALS_73   CTR_76    NA._3   CTR_77 
##     22.9     20.1     23.1     23.9     26.0     22.3     23.3     23.5 
##   ALS_74   ALS_75   N.A._2 mimic_29  SYMP_14   CTR_78   CTR_79   ALS_76 
##     23.3     23.4     22.2     23.4     28.9     27.6     27.7     31.1 
##   CTR_80    NA._4  SYMP_15   ALS_77   ALS_78  PGMC_86   ALS_79 mimic_30 
##     29.3     25.0     25.1     27.0     27.0     25.7     28.9     24.4 
##   CTR_81   ALS_80   ALS_81   ALS_82   ALS_83   ALS_84  SYMP_16   ALS_85 
##     25.8     27.5     25.8     25.9     28.1     25.7     26.3     25.4 
##   CTR_82   ALS_86   ALS_87   CTR_83  SYMP_17   ALS_88   ALS_89  SYMP_18 
##     23.0     24.2     25.3     26.7     23.0     25.7     24.7     25.8 
## mimic_31   ALS_90 mimic_32   ALS_91 
##     26.9     27.1     27.6     23.2
round(apply(X = as.data.frame(assay(lipidomics_data_plasma$se_filtered)), function(x) sum(is.na(x)), MARGIN = 2) / nrow(as.data.frame(assay(lipidomics_data_plasma$se_filtered))) * 100 , 1)
##   PGMC_1   PGMC_2   PGMC_3   PGMC_4   PGMC_5   PGMC_6   PGMC_7   PGMC_8 
##      1.6      2.5      4.8      3.9      2.3      1.8      0.9      1.2 
##   PGMC_9  PGMC_10  PGMC_11  PGMC_12  PGMC_13  PGMC_14  PGMC_15  PGMC_16 
##      1.6      2.6      2.9      3.4      1.8      0.7      1.2      1.3 
##  PGMC_17  PGMC_18  PGMC_19  PGMC_20  PGMC_21  PGMC_22  PGMC_23  PGMC_24 
##      1.5      1.8      6.6      7.9      3.7      6.3      7.6      2.9 
##  PGMC_25  PGMC_26  PGMC_27  PGMC_28  PGMC_29  PGMC_30  PGMC_31  PGMC_32 
##      5.8      5.7      3.9      2.8      1.3      1.6      1.6      1.3 
##  PGMC_33  PGMC_34  PGMC_35  PGMC_36  PGMC_37  PGMC_38  PGMC_39  PGMC_40 
##      1.3      0.7      1.3      2.6      2.3      1.5      2.0      1.3 
##  PGMC_41  PGMC_42  PGMC_43  PGMC_44  PGMC_45  PGMC_46  PGMC_47  PGMC_48 
##      3.4      2.8      0.6      1.2      0.9      2.0      1.3      0.9 
##  PGMC_49  PGMC_50  PGMC_51  PGMC_52  PGMC_53  PGMC_54  PGMC_55  PGMC_56 
##      1.3      1.6      2.9      5.8      1.2      1.0      2.2      1.5 
##  PGMC_57  PGMC_58  PGMC_59  PGMC_60  PGMC_61  PGMC_62  PGMC_63  PGMC_64 
##      1.8      3.5      6.1      1.8      0.4      2.8      2.2      0.6 
##  PGMC_65  PGMC_66  PGMC_67  PGMC_68  PGMC_69  PGMC_70  PGMC_71  PGMC_72 
##      2.0      1.2      1.3      0.4      0.9      1.9      2.3      0.4 
##  PGMC_73  PGMC_74  PGMC_75  PGMC_76  PGMC_77  PGMC_78  PGMC_79  PGMC_80 
##      2.9      1.0      2.8      0.9      5.4      1.3      1.2      1.0 
##  PGMC_81  PGMC_82  PGMC_83  PGMC_84  PGMC_85    ALS_1    CTR_1    CTR_2 
##      0.4      1.0      1.8      7.0      4.2      2.8      0.9      1.8 
##    ALS_2    CTR_3    ALS_3  mimic_1    CTR_4    ALS_4    ALS_5    ALS_6 
##      0.4      0.9      0.9      4.2      1.0      0.6      0.7      1.9 
##   SYMP_1  mimic_2    ALS_7    CTR_5    CTR_6  mimic_3    ALS_8    ALS_9 
##      1.0      0.6      0.9      1.3      0.7      2.3      1.9      1.9 
##    CTR_7    CTR_8    CTR_9   SYMP_2   CTR_10   SYMP_3   CTR_11   ALS_10 
##      5.8      0.3      0.9      0.9      1.3      1.0      1.5      0.9 
##   ALS_11   ALS_12   CTR_12   SYMP_4  mimic_4   ALS_13   CTR_13   ALS_14 
##      0.6      0.1      1.0      0.4      1.8      0.7      0.1      0.4 
##   ALS_15   CTR_14  mimic_5  mimic_6  mimic_7   ALS_16   CTR_15   CTR_16 
##      0.7      2.0      0.3      2.9      1.9      4.1      0.6      1.0 
##  mimic_8   ALS_17   CTR_17   ALS_18   CTR_18  mimic_9   CTR_19   CTR_20 
##      0.9      1.6      1.0      1.3      1.9      2.0      2.3      1.6 
## mimic_10   CTR_21   CTR_22 mimic_11   ALS_19   CTR_23 mimic_12   ALS_20 
##      0.9      1.5      2.2      0.9      0.3      1.5      1.3      0.6 
##   ALS_21   ALS_22   ALS_23   ALS_24   CTR_24   CTR_25   ALS_25   CTR_26 
##      6.3      0.6      1.3      0.3      2.8      1.9      3.9      1.5 
##   CTR_27   ALS_26   ALS_27   CTR_28   CTR_29   CTR_30   CTR_31   CTR_32 
##      1.9      1.0      0.6      0.4      0.4      2.2      2.0      1.8 
##   CTR_33   ALS_28   ALS_29 mimic_13   ALS_30   CTR_34   ALS_31   CTR_35 
##      2.2      1.5      1.9      0.4      2.5      2.6      1.0      3.5 
##   ALS_32   ALS_33   CTR_36   CTR_37   ALS_34   ALS_35   ALS_36   SYMP_5 
##      0.7      1.0      0.4      4.1      0.9      2.6      1.6      1.5 
##   CTR_38   ALS_37   CTR_39   CTR_40   ALS_38 mimic_14   ALS_39 mimic_15 
##      1.6      0.1      1.0      2.0      2.9      0.9      1.2      1.3 
##   CTR_41   CTR_42   CTR_43   ALS_40   ALS_41   CTR_44   SYMP_6   ALS_42 
##      1.2      2.0      0.7      1.5      1.2      2.2      0.9      1.2 
##   ALS_43   ALS_44   SYMP_7   SYMP_8   CTR_45   ALS_45   CTR_46 mimic_16 
##      2.8      3.8      2.9      0.6      1.0      2.0      3.9      0.6 
## mimic_17   CTR_47   CTR_48   ALS_46   CTR_49   CTR_50 mimic_18   ALS_47 
##      0.9      1.2      0.9      2.2      1.9      0.6      0.9      1.2 
##   CTR_51   CTR_52 mimic_19   CTR_53   CTR_54   CTR_55   CTR_56   ALS_48 
##      0.7      1.0      0.9      0.6      1.2      2.5      0.9      1.2 
##   ALS_49   ALS_50   CTR_57   CTR_58   CTR_59   CTR_60   ALS_51   CTR_61 
##      5.4      3.5      1.8      1.6      1.0      2.5      0.6      4.1 
##   ALS_52   CTR_62   ALS_53   CTR_63   ALS_54   ALS_55   ALS_56   CTR_64 
##      0.6      1.3      4.5      1.9      3.1      1.3      3.9      2.3 
##   ALS_57 mimic_20   ALS_58   SYMP_9   ALS_59  SYMP_10   ALS_60   ALS_61 
##      1.5      1.6      2.3      1.9      1.9      2.9      1.6      2.0 
##   ALS_62   ALS_63   ALS_64   ALS_65 mimic_21 mimic_22 mimic_23   CTR_65 
##      1.8      0.7      2.0      1.3      1.9      4.5      3.9      2.2 
##   CTR_66 mimic_24 mimic_25   CTR_67   ALS_66   CTR_68   N.A._1   ALS_67 
##      1.5      2.6      2.0      2.9      2.2      3.9      1.2      1.9 
##  SYMP_11   CTR_69   ALS_68   CTR_70 mimic_26  SYMP_12   ALS_69 mimic_27 
##      2.2      1.2      3.1      2.6      1.2      1.8      6.0      1.5 
##   ALS_70  SYMP_13   CTR_71   CTR_72    NA._1   ALS_71 mimic_28    NA._2 
##      1.9      2.2      3.1      2.2      2.6      4.5      1.2      1.8 
##   ALS_72   CTR_73   CTR_74   CTR_75   ALS_73   CTR_76    NA._3   CTR_77 
##      1.6      0.9      2.8      3.5      6.1      2.2      2.0      2.8 
##   ALS_74   ALS_75   N.A._2 mimic_29  SYMP_14   CTR_78   CTR_79   ALS_76 
##      3.4      2.6      1.5      2.5      8.0      6.7      6.4     10.8 
##   CTR_80    NA._4  SYMP_15   ALS_77   ALS_78  PGMC_86   ALS_79 mimic_30 
##      9.1      4.2      4.2      6.6      5.7      4.8      8.8      3.2 
##   CTR_81   ALS_80   ALS_81   ALS_82   ALS_83   ALS_84  SYMP_16   ALS_85 
##      4.5      7.0      4.7      4.7      7.9      4.7      4.7      4.5 
##   CTR_82   ALS_86   ALS_87   CTR_83  SYMP_17   ALS_88   ALS_89  SYMP_18 
##      2.2      3.4      4.4      5.4      2.5      4.7      3.7      4.2 
## mimic_31   ALS_90 mimic_32   ALS_91 
##      6.3      6.7      6.4      2.6
#normalization
lipidomics_data_plasma$se_filt_norm <- normalize_vsn(lipidomics_data_plasma$se_filtered)
DEP::meanSdPlot(lipidomics_data_plasma$se_filt_norm)

writexl::write_xlsx(as.data.frame(assay(lipidomics_data_plasma$se_filtered)),"results/data_filtered_plasma.xlsx")
writexl::write_xlsx(as.data.frame(assay(lipidomics_data_plasma$se_filt_norm)),"results/data_filtered_normalized_plasma.xlsx")

# imputation
lipidomics_data_plasma$se_filt_impute <- impute(
  lipidomics_data_plasma$se_filt_norm,fun = "MinProb",q = 0.01)
## Imputing along margin 2 (samples/columns).
## [1] 0.6924497
writexl::write_xlsx(as.data.frame(assay(lipidomics_data_plasma$se_filt_impute)),"results/data_filtered_normalized_imputed_plasma.xlsx")

Step 7: Visualizations

The seventh step is to create visualizations, such as heatmaps and PCAs (for each fluid with ID/batch info, and for both fluids together)

- CSF

## Distribution of expression values per ID and tube number
mean_expression_plot(data = t(assay(lipidomics_data_CSF$se)), 
                      file_sample = "plots/boxplots_expression_each_sample_CSF.pdf",
                      file_mass = "plots/boxplots_expression_each_lipid_CSF.pdf",
                      title = "CSF lipidomics")

## Heatmaps
# CSF
title = "lipidomics CSF" 

data = assay(lipidomics_data_CSF$se)
#row annotation
names_lipids = rownames(data)
Lipids_final <- read_excel("data input/RE_ metabolomics/Lipids_final.xlsx")
lipids = Lipids_final %>% 
  select(Name,Category) %>%
  filter(Name %in% names_lipids)
lipids = lipids[match(lipids$Name,names_lipids),]
annotation_row = data.frame(lipid_type = as.factor(lipids$Category))
rownames(annotation_row) = lipids$Name
mycolors = brewer.pal(length(unique(annotation_row$lipid_type)), "Set1")
names(mycolors) <- unique(annotation_row$lipid_type)
annotation_colours <- list(
  lipid_type = list(mycolors)[[1]])
        
#without grouping, all lipids
p = make_pheatmap(data = data, 
                  cluster_cols = F, 
                  main = paste0("Heatmap all lipids\n",title, "\n not clustered"), 
                  show_rownames = F,
                  labels_col = colnames(data))

save_pheatmap_pdf(p, filename = paste0("plots/heatmap_raw_",title,".pdf"))
## quartz_off_screen 
##                 2
#without grouping, filtered lipids
data = assay(lipidomics_data_CSF$se_filt_norm)

#row annotation
names_lipids = rownames(data)
lipids = Lipids_final %>% 
  select(Name,Category) %>%
  filter(Name %in% names_lipids)
lipids = lipids[match(lipids$Name,names_lipids),]
annotation_row = data.frame(lipid_type = as.factor(lipids$Category))
rownames(annotation_row) = lipids$Name
mycolors = brewer.pal(length(unique(annotation_row$lipid_type)), "Set1")
names(mycolors) <- unique(annotation_row$lipid_type)
annotation_colours <- list(
  lipid_type = list(mycolors)[[1]])

p = make_pheatmap(data = data, 
                  cluster_cols = F, 
                  main = paste0("Heatmap filtered lipids\n",title, "\n not clustered"), 
                  show_rownames = F,
                  labels_col = colnames(data))

save_pheatmap_pdf(p, filename = paste0("plots/heatmap_filtered_",title,".pdf"))
## quartz_off_screen 
##                 2
#without grouping, filtered and imputed
data = assay(lipidomics_data_CSF$se_filt_impute)

#row annotation
names_lipids = rownames(data)
lipids = Lipids_final %>% 
  select(Name,Category) %>%
  filter(Name %in% names_lipids)
lipids = lipids[match(lipids$Name,names_lipids),]
annotation_row = data.frame(lipid_type = as.factor(lipids$Category))
rownames(annotation_row) = lipids$Name
mycolors = brewer.pal(length(unique(annotation_row$lipid_type)), "Set1")
names(mycolors) <- unique(annotation_row$lipid_type)
annotation_colours <- list(
  lipid_type = list(mycolors)[[1]])

p = make_pheatmap(data = data, 
                  cluster_cols = F, 
                  main = paste0("Heatmap imputed lipids\n",title, "\n not clustered"), 
                  show_rownames = F,
                  labels_col = colnames(data))

save_pheatmap_pdf(p, filename = paste0("plots/heatmap_imputed_",title,".pdf"))
## quartz_off_screen 
##                 2
# without grouping, 100 most variable lipids
d = assay(lipidomics_data_CSF$se)
d2 = head(order(rowVars(d),decreasing = T),100)

data = d[d2,]
#row annotation
names_lipids = rownames(data)
lipids = Lipids_final %>% 
  select(Name,Category) %>%
  filter(Name %in% names_lipids)
lipids = lipids[match(lipids$Name,names_lipids),]
annotation_row = data.frame(lipid_type = as.factor(lipids$Category))
rownames(annotation_row) = lipids$Name
mycolors = brewer.pal(length(unique(annotation_row$lipid_type)), "Set1")
names(mycolors) <- unique(annotation_row$lipid_type)
annotation_colours <- list(
  lipid_type = list(mycolors)[[1]])

p = make_pheatmap(data = d[d2,], 
                  cluster_cols = F, 
                  main = paste0("Heatmap 100 most variable lipids\n",title, "\nnot clustered"),
                  labels_col = colnames(d))

save_pheatmap_pdf(p, filename = paste0("plots/heatmap_raw_mostvar_",title,".pdf"))
## quartz_off_screen 
##                 2
## UMAP (with participant status type)
d_CSF = t(assay(lipidomics_data_CSF$se_filt_impute))
labels_group = tube_batch_CSF$type
group = c("mediumpurple1", "darksalmon", "yellow4","blue4","seagreen4","orange3")
UMAP_density_plot(data = d_CSF,
                          ggtitle = paste0("UMAP with type labels\n", title),
                          legend_name = "Fluid labels",
                          labels = labels_group,
                          file_location = paste0("plots/UMAP_type_",title,".pdf"),
                          file_location_labels =paste0("plots/UMAP_type_labels_",title,".pdf"),
                          colour_set = group)

# UMAP (with batch info)
labels_group = tube_batch_CSF$Batch
group = c("#3DB7E4", "#FF8849", "#69BE28")
UMAP_density_plot(data = d_CSF,
                          ggtitle = paste0("UMAP with batch labels\n", title),
                          legend_name = "Fluid labels",
                          labels = labels_group,
                          file_location = paste0("plots/UMAP_batch_",title,".pdf"),
                          file_location_labels =paste0("plots/UMAP_batch_labels_",title,".pdf"),
                          colour_set = group)

- Plasma

## Distribution of expression values per ID and tube number
mean_expression_plot(data = t(assay(lipidomics_data_plasma$se)), 
                      file_sample = "plots/boxplots_expression_each_sample_plasma.pdf",
                      file_mass = "plots/boxplots_expression_each_lipid_plasma.pdf",
                      title = "plasma lipidomics")

## Heatmaps
# plasma
title = "lipidomics plasma" 

data = assay(lipidomics_data_plasma$se)
#row annotation
names_lipids = rownames(data)
Lipids_final <- read_excel("data input/RE_ metabolomics/Lipids_final.xlsx")
lipids = Lipids_final %>% 
  select(Name,Category) %>%
  filter(Name %in% names_lipids)
lipids = lipids[match(lipids$Name,names_lipids),]
annotation_row = data.frame(lipid_type = as.factor(lipids$Category))
rownames(annotation_row) = lipids$Name
mycolors = brewer.pal(length(unique(annotation_row$lipid_type)), "Set1")
names(mycolors) <- unique(annotation_row$lipid_type)
annotation_colours <- list(
  lipid_type = list(mycolors)[[1]])
        
#without grouping, all lipids
p = make_pheatmap(data = data, 
                  cluster_cols = F, 
                  main = paste0("Heatmap all lipids\n",title, "\n not clustered"), 
                  show_rownames = F,
                  labels_col = colnames(data))

save_pheatmap_pdf(p, filename = paste0("plots/heatmap_raw_",title,".pdf"))
## quartz_off_screen 
##                 2
#without grouping, filtered lipids
data = assay(lipidomics_data_plasma$se_filt_norm)

#row annotation
names_lipids = rownames(data)
lipids = Lipids_final %>% 
  select(Name,Category) %>%
  filter(Name %in% names_lipids)
lipids = lipids[match(lipids$Name,names_lipids),]
annotation_row = data.frame(lipid_type = as.factor(lipids$Category))
rownames(annotation_row) = lipids$Name
mycolors = brewer.pal(length(unique(annotation_row$lipid_type)), "Set1")
names(mycolors) <- unique(annotation_row$lipid_type)
annotation_colours <- list(
  lipid_type = list(mycolors)[[1]])

p = make_pheatmap(data = data, 
                  cluster_cols = F, 
                  main = paste0("Heatmap filtered lipids\n",title, "\n not clustered"), 
                  show_rownames = F,
                  labels_col = colnames(data))

save_pheatmap_pdf(p, filename = paste0("plots/heatmap_filtered_",title,".pdf"))
## quartz_off_screen 
##                 2
#without grouping, filtered and imputed
data = assay(lipidomics_data_plasma$se_filt_impute)

#row annotation
names_lipids = rownames(data)
lipids = Lipids_final %>% 
  select(Name,Category) %>%
  filter(Name %in% names_lipids)
lipids = lipids[match(lipids$Name,names_lipids),]
annotation_row = data.frame(lipid_type = as.factor(lipids$Category))
rownames(annotation_row) = lipids$Name
mycolors = brewer.pal(length(unique(annotation_row$lipid_type)), "Set1")
names(mycolors) <- unique(annotation_row$lipid_type)
annotation_colours <- list(
  lipid_type = list(mycolors)[[1]])

p = make_pheatmap(data = data, 
                  cluster_cols = F, 
                  main = paste0("Heatmap imputed lipids\n",title, "\n not clustered"), 
                  show_rownames = F,
                  labels_col = colnames(data))

save_pheatmap_pdf(p, filename = paste0("plots/heatmap_imputed_",title,".pdf"))
## quartz_off_screen 
##                 2
# without grouping, 100 most variable lipids
d = assay(lipidomics_data_plasma$se)
d2 = head(order(rowVars(d),decreasing = T),100)

data = d[d2,]
#row annotation
names_lipids = rownames(data)
lipids = Lipids_final %>% 
  select(Name,Category) %>%
  filter(Name %in% names_lipids)
lipids = lipids[match(lipids$Name,names_lipids),]
annotation_row = data.frame(lipid_type = as.factor(lipids$Category))
rownames(annotation_row) = lipids$Name
mycolors = brewer.pal(length(unique(annotation_row$lipid_type)), "Set1")
names(mycolors) <- unique(annotation_row$lipid_type)
annotation_colours <- list(
  lipid_type = list(mycolors)[[1]])

p = make_pheatmap(data = d[d2,], 
                  cluster_cols = F, 
                  main = paste0("Heatmap 100 most variable lipids\n",title, "\nnot clustered"),
                  labels_col = colnames(d))

save_pheatmap_pdf(p, filename = paste0("plots/heatmap_raw_mostvar_",title,".pdf"))
## quartz_off_screen 
##                 2
## UMAP (with participant status type)
d_plasma = t(assay(lipidomics_data_plasma$se_filt_impute))
labels_group = tube_batch_plasma$type
group = c("mediumpurple1", "darksalmon", "yellow4","blue4","seagreen4","orange3")
UMAP_density_plot(data = d_plasma,
                          ggtitle = paste0("UMAP with type labels\n", title),
                          legend_name = "Fluid labels",
                          labels = labels_group,
                          file_location = paste0("plots/UMAP_type_",title,".pdf"),
                          file_location_labels =paste0("plots/UMAP_type_labels_",title,".pdf"),
                          colour_set = group)

# UMAP (with batch info)
labels_group = tube_batch_plasma$Batch
group = c("#3DB7E4", "#FF8849", "#69BE28","#757083","#CF7684","#A0C8C0","#CDAE3B")
UMAP_density_plot(data = d_plasma,
                          ggtitle = paste0("UMAP with batch labels\n", title),
                          legend_name = "Fluid labels",
                          labels = labels_group,
                          file_location = paste0("plots/UMAP_batch_",title,".pdf"),
                          file_location_labels =paste0("plots/UMAP_batch_labels_",title,".pdf"),
                          colour_set = group)

  • CSF and Plasma together
 proteins_in_both_fluids = colnames(d_CSF)[colnames(d_CSF) %in% colnames(d_plasma)]
  
  d_1 = d_CSF[,proteins_in_both_fluids]
  d_2 = d_plasma[,proteins_in_both_fluids]
  
  d = do.call("rbind",list(d_1,d_2))
  
  labels_group = c(rep("CSF", 144), rep("Plasma", 316))
  title = "plasma_vs_CSF"
  d <- is.na(d)
  
  dim(d)
## [1] 460 183
#perform plots with function
UMAP_density_plot(data = d,
                  ggtitle = paste0("UMAP with fluid labels\n", title),
                  legend_name = "Fluid labels",
                  labels = labels_group,
                  file_location = paste0("plots/UMAP_fluid_group_lipids_",title,".pdf"),
                  file_location_labels=paste0("plots/UMAP_fluid_group_labels_",title,".pdf"),
                  colour_set = group)

Step 8

The eighth step is to perform differential expression analyses, with particular interest of eALS vs CTR, PGMC vs CTR. For this, the information between IDs and the status (eALS, CTR, PGMC, mimic) have to be collected from other datasets